Lab Practice 1: Decomposition Methods

Preliminaries

Load libraries

If you get an error go to the packages panel, click install and type the name of the library (you only need to do this once).

library(fpp2) 
library(tidyverse)

Set working directory

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

Reading time series data from csv files

Load dataset

fdata <- read_csv("Unemployment.dat")
## Rows: 156 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): DATE
## dbl (1): TOTAL
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(fdata)
## # A tibble: 6 × 2
##   DATE         TOTAL
##   <chr>        <dbl>
## 1 01/01/2010 4048493
## 2 01/02/2010 4130625
## 3 01/03/2010 4166613
## 4 01/04/2010 4142425
## 5 01/05/2010 4066202
## 6 01/06/2010 3982368
glimpse(fdata)
## Rows: 156
## Columns: 2
## $ DATE  <chr> "01/01/2010", "01/02/2010", "01/03/2010", "01/04/2010", "01/05/2…
## $ TOTAL <dbl> 4048493, 4130625, 4166613, 4142425, 4066202, 3982368, 3908578, 3…

QUESTION:

What is the frequency of observation for this data. Is this data yearly? (or quarterly, monthly, weekly, daily,…)

Working with dates in R

Convert character columns to R date type

fdata$DATE <- as.Date(fdata$DATE, format = "%d/%m/%Y")
head(fdata)
## # A tibble: 6 × 2
##   DATE         TOTAL
##   <date>       <dbl>
## 1 2010-01-01 4048493
## 2 2010-02-01 4130625
## 3 2010-03-01 4166613
## 4 2010-04-01 4142425
## 5 2010-05-01 4066202
## 6 2010-06-01 3982368

Order the table by date (using the pipe operator)

fdata <- fdata %>% arrange(DATE)

# same as: fdata <- arrange(fdata,DATE)

Check for missing dates (time gaps)

How do we know if there are time gaps in the data?

range(fdata$DATE)
## [1] "2010-01-01" "2022-12-01"
min(fdata$DATE)
## [1] "2010-01-01"
max(fdata$DATE)
## [1] "2022-12-01"

Therefore we can create a complete sequence of months with the same range and compare it to the dates in our data.

date_range <- seq.Date(min(fdata$DATE), max(fdata$DATE), by = "months")

head(date_range)
## [1] "2010-01-01" "2010-02-01" "2010-03-01" "2010-04-01" "2010-05-01"
## [6] "2010-06-01"
tail(date_range)
## [1] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [6] "2022-12-01"

Now we do the comparison

date_range[!date_range %in% fdata$DATE] 
## Date of length 0

To practice this, let us create a new dataset missing one date.

fdata2 <- fdata[c(1:4, 6:nrow(fdata)), ]
head(fdata2)
## # A tibble: 6 × 2
##   DATE         TOTAL
##   <date>       <dbl>
## 1 2010-01-01 4048493
## 2 2010-02-01 4130625
## 3 2010-03-01 4166613
## 4 2010-04-01 4142425
## 5 2010-06-01 3982368
## 6 2010-07-01 3908578

Now if you repeat the comparison

date_range[!(date_range %in% fdata2$DATE)] 
## [1] "2010-05-01"

Check for missing data (NA) in time series value columns

sum(is.na(fdata$TOTAL))
## [1] 0

Again, let us see an example with missing data

fdata2 <- fdata
fdata2$TOTAL[5] = NA
head(fdata2)
## # A tibble: 6 × 2
##   DATE         TOTAL
##   <date>       <dbl>
## 1 2010-01-01 4048493
## 2 2010-02-01 4130625
## 3 2010-03-01 4166613
## 4 2010-04-01 4142425
## 5 2010-05-01      NA
## 6 2010-06-01 3982368

and now

sum(is.na(fdata2$TOTAL))
## [1] 1

The ts object for time series

We will now convert the table to a time series object. We need to provide the column (or columns) containing the values and we also need to describe the time indexing. To do this we provide the starting date and the number of observations per unit of time. The starting date is provided as a pair (year, month), or (year, quarter),

The single most important question is: what is the frequency of your data? see Hyndman FPP, The seasonal period in Section 2.1

# start -> year and month
# frequency = 12 -> monthly data
# frequency = 4 -> quarterly data

Now we create the ts object and display the first and last values

y <- ts(fdata$TOTAL, start = c(2010, 1), frequency = 12)
head(y, 30)
##          Jan     Feb     Mar     Apr     May     Jun     Jul     Aug     Sep
## 2010 4048493 4130625 4166613 4142425 4066202 3982368 3908578 3969661 4017763
## 2011 4231003 4299263 4333669 4269360 4189659 4121801 4079742 4130927 4226744
## 2012 4599829 4712098 4750867 4744235 4714122 4615269                        
##          Oct     Nov     Dec
## 2010 4085976 4110294 4100073
## 2011 4360926 4420462 4422359
## 2012
tail(y, 12)
##          Jan     Feb     Mar     Apr     May     Jun     Jul     Aug     Sep
## 2022 3123078 3111684 3108763 3022503 2922991 2880582 2883812 2924240 2941919
##          Oct     Nov     Dec
## 2022 2914892 2881380 2837653

Time Plots

The most basic and useful plot for time series is the time plot:

autoplot(y) +
  ggtitle("Unemployment in Spain") +
  xlab("Year") + ylab("Number unemployed")

or using the R base plotting system:

plot.ts(y, 
        main="Unemployment in Spain",
        xlab="Year",
        ylab="Number unemployed")

Subsetting time series with window

# Select time series time frame
y2 <- y
y <- window(y, start = c(2010,1), end = c(2019,12))

# or directly from the original data:
y <- ts(fdata$TOTAL, start = c(2010,1), end = c(2019,12), frequency = 12)

Now we plot the time series

autoplot(y2, color = "blue") +
  ggtitle("Unemployment in Spain") +
  xlab("Year") + ylab("Number unemployed") + 
  autolayer(y, color = "red")

Decomposition methods

Classical additive decomposition

y_dec_add <- decompose(y, type="additive")

The decomposition object is a list that contains (among other things) the components. To get e.g. the seasonal component:

head(y_dec_add$seasonal, 20)
##               Jan          Feb          Mar          Apr          May
## 2010  108216.0413  140371.9024  127003.0367   53488.8191  -31135.3893
## 2011  108216.0413  140371.9024  127003.0367   53488.8191  -31135.3893
##               Jun          Jul          Aug          Sep          Oct
## 2010 -121222.6022 -157327.1485 -117894.5328  -69106.8198   21074.8654
## 2011 -121222.6022 -157327.1485 -117894.5328                          
##               Nov          Dec
## 2010   46194.2820     337.5459
## 2011

Let us visualize the decomposition

autoplot(y_dec_add) + xlab("Year") +
  ggtitle("Classical additive decomposition")

Classical Multiplicative decomposition

Similarly

y_dec_mult <- decompose(y, type="multiplicative")
autoplot(y_dec_mult) + xlab("Year") +
  ggtitle("Classical multiplicative decomposition")

SEATS decomposition method

library(seasonal)
## 
## Attaching package: 'seasonal'
## The following object is masked from 'package:tibble':
## 
##     view

We obtain the decomposition as follows:

y_dec_seas <- seas(y)

In this case the components are stored in a mts (multi time series) object

head(y_dec_seas$data)
##            final  seasonal seasonaladj   trend irregular adjustfac
## Jan 2010 3963891 1.0213433     3963891 3964972 0.9997271 1.0213433
## Feb 2010 3989900 1.0352703     3989900 3990377 0.9998804 1.0352703
## Mar 2010 4017108 1.0381748     4017108 4016480 1.0001565 1.0372171
## Apr 2010 4040293 1.0243325     4040293 4039090 1.0002978 1.0252783
## May 2010 4056567 1.0023751     4056567 4056696 0.9999683 1.0023751
## Jun 2010 4071217 0.9781762     4071217 4067370 1.0009459 0.9781762

But we can use seasonal(), trendcycle() and remainder() functions to extract the individual components. With seasadj() we can compute the seasonally adjusted time series.

autoplot(y_dec_seas) + xlab("Year") +
  ggtitle("SEATS decomposition")

Compare the seasonal components for different decomposition methods

autoplot(seasonal(y_dec_mult), series = "Multiplicative") +
  forecast::autolayer(seasonal(y_dec_seas), series = "SEATS")

Compare seasonal adjustment components (i.e. subtracting the seasonal component from the raw series)

autoplot(seasadj(y_dec_add), series = "Additive") +
  forecast::autolayer(seasadj(y_dec_mult), series = "Multiplicative") +
  forecast::autolayer(seasadj(y_dec_seas),series = "SEATS")

autoplot(seasadj(y_dec_seas), series = "SEATS")

Seasonal subseries plots

Can be obtained from the decompositions as follows:

ggsubseriesplot(seasonal(y_dec_add)) 

ggsubseriesplot(seasonal(y_dec_mult)) 

ggsubseriesplot(seasonal(y_dec_seas)) 

Seasonal plot with ggseasoonplot

ggseasonplot(y, year.labels=TRUE,continuous=TRUE)

Additional examples and other time series and libraries

library(TSstudio)

Let us load this dataset (run ?US_indicators after loading the data)

data(US_indicators)

Examine the data

glimpse(US_indicators)
## Rows: 528
## Columns: 3
## $ Date                <date> 1976-01-31, 1976-02-29, 1976-03-31, 1976-04-30, 1…
## $ `Vehicle Sales`     <dbl> 885.2, 994.7, 1243.6, 1191.2, 1203.2, 1254.7, 1162…
## $ `Unemployment Rate` <dbl> 8.8, 8.7, 8.1, 7.4, 6.8, 8.0, 7.8, 7.6, 7.4, 7.2, …
str(US_indicators)
## 'data.frame':    528 obs. of  3 variables:
##  $ Date             : Date, format: "1976-01-31" "1976-02-29" ...
##  $ Vehicle Sales    : num  885 995 1244 1191 1203 ...
##  $ Unemployment Rate: num  8.8 8.7 8.1 7.4 6.8 8 7.8 7.6 7.4 7.2 ...
head(US_indicators)
##         Date Vehicle Sales Unemployment Rate
## 1 1976-01-31         885.2               8.8
## 2 1976-02-29         994.7               8.7
## 3 1976-03-31        1243.6               8.1
## 4 1976-04-30        1191.2               7.4
## 5 1976-05-31        1203.2               6.8
## 6 1976-06-30        1254.7               8.0
tail(US_indicators)
##           Date Vehicle Sales Unemployment Rate
## 523 2019-07-31      1443.947               4.0
## 524 2019-08-31      1685.342               3.8
## 525 2019-09-30      1315.632               3.3
## 526 2019-10-31      1380.180               3.3
## 527 2019-11-30      1446.483               3.3
## 528 2019-12-31      1565.023               3.4

Rename variables and order dates

US_indicators <- US_indicators %>% 
  rename( VehicleSales = 'Vehicle Sales', UnemploymentRate= `Unemployment Rate`) %>% 
  arrange(Date)

head(US_indicators)
##         Date VehicleSales UnemploymentRate
## 1 1976-01-31        885.2              8.8
## 2 1976-02-29        994.7              8.7
## 3 1976-03-31       1243.6              8.1
## 4 1976-04-30       1191.2              7.4
## 5 1976-05-31       1203.2              6.8
## 6 1976-06-30       1254.7              8.0
tail(US_indicators)
##           Date VehicleSales UnemploymentRate
## 523 2019-07-31     1443.947              4.0
## 524 2019-08-31     1685.342              3.8
## 525 2019-09-30     1315.632              3.3
## 526 2019-10-31     1380.180              3.3
## 527 2019-11-30     1446.483              3.3
## 528 2019-12-31     1565.023              3.4

Check for complete dates and data

We will try to use the same strategy, but have you noticed any difference with the previous case? Look at the dates.

lubridate::day(US_indicators$Date) %>% head(20) 
##  [1] 31 29 31 30 31 30 31 31 30 31 30 31 31 28 31 30 31 30 31 31

This complicates the strategy of creating a complete date sequence and comparing it with the dates in our data. Since this is monthly data and month is all we care about, we replace all dates with the first day of the month (run ?lubridate::day).

lubridate::day(US_indicators$Date) <- 1
head(US_indicators)
##         Date VehicleSales UnemploymentRate
## 1 1976-01-01        885.2              8.8
## 2 1976-02-01        994.7              8.7
## 3 1976-03-01       1243.6              8.1
## 4 1976-04-01       1191.2              7.4
## 5 1976-05-01       1203.2              6.8
## 6 1976-06-01       1254.7              8.0

Now we proceed as before

date_range <- seq.Date(min(US_indicators$Date), max(US_indicators$Date), by = "months")
head(date_range)
## [1] "1976-01-01" "1976-02-01" "1976-03-01" "1976-04-01" "1976-05-01"
## [6] "1976-06-01"
tail(date_range)
## [1] "2019-07-01" "2019-08-01" "2019-09-01" "2019-10-01" "2019-11-01"
## [6] "2019-12-01"
date_range[!date_range %in% US_indicators$Date] 
## Date of length 0

Check for missing data in the value columns

sum(is.na(US_indicators$VehicleSales))
## [1] 0
sum(is.na(US_indicators$UnemploymentRate))
## [1] 0

Alternatively

all(complete.cases(US_indicators))
## [1] TRUE

One dimensional time series, time plot with TSstudio

tvs <- US_indicators[, c("Date", "VehicleSales")]
str(tvs)
## 'data.frame':    528 obs. of  2 variables:
##  $ Date        : Date, format: "1976-01-01" "1976-02-01" ...
##  $ VehicleSales: num  885 995 1244 1191 1203 ...

We create the ts object as we did before:

library(lubridate)
start_point <- c(year(min(tvs$Date)), month(min(tvs$Date)))
start_point
## [1] 1976    1
tvs_ts <- ts(data = tvs$'VehicleSales',
             start = start_point,
             frequency = 12)

and we do a basic time plot

plot.ts(tvs_ts,
        main = "US Monthly Total Vehicle Sales",
        ylab = "Thousands of Vehicle",
        xlab = "Time"
)

Multiple time series object (mts)

We create it in a similar way:

US_indicators_ts <- ts(data = US_indicators[, c("VehicleSales",
                                                "UnemploymentRate")],
                       start = c(year(min(tvs$Date)),
                                 month(min(tvs$Date))),
                       frequency = 12)
str(US_indicators_ts)
##  Time-Series [1:528, 1:2] from 1976 to 2020: 885 995 1244 1191 1203 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:2] "VehicleSales" "UnemploymentRate"

Change plot type to single and see what happens. When would you use one or the other?

plot.ts(US_indicators_ts,
        plot.type = "multiple",
        main = "US Monthly Vehicle Sales vs. Unemployment Rate",
        xlab = "Time")

Plots with library TSstudio

ts_plot(tvs_ts,
        title = "US Monthly Total Vehicle Sales",
        Ytitle = "Thousands of Vehicle",
        slider = TRUE
)
ts_plot(US_indicators_ts,
        title = "US Monthly Vehicle Sales vs. Unemployment Rate",
        type = "multiple")

Moving Averages

head(US_indicators)
##         Date VehicleSales UnemploymentRate
## 1 1976-01-01        885.2              8.8
## 2 1976-02-01        994.7              8.7
## 3 1976-03-01       1243.6              8.1
## 4 1976-04-01       1191.2              7.4
## 5 1976-05-01       1203.2              6.8
## 6 1976-06-01       1254.7              8.0
tvs_data <- US_indicators[,-3]

head(tvs_data)
##         Date VehicleSales
## 1 1976-01-01        885.2
## 2 1976-02-01        994.7
## 3 1976-03-01       1243.6
## 4 1976-04-01       1191.2
## 5 1976-05-01       1203.2
## 6 1976-06-01       1254.7
tvs_data$MA5 <- slider::slide_dbl(tvs$VehicleSales, mean,
                             .before = 2, .after = 2, .complete = TRUE)

head(tvs_data, 15)
##          Date VehicleSales     MA5
## 1  1976-01-01        885.2      NA
## 2  1976-02-01        994.7      NA
## 3  1976-03-01       1243.6 1103.58
## 4  1976-04-01       1191.2 1177.48
## 5  1976-05-01       1203.2 1211.00
## 6  1976-06-01       1254.7 1167.50
## 7  1976-07-01       1162.3 1140.84
## 8  1976-08-01       1026.1 1126.08
## 9  1976-09-01       1057.9 1092.02
## 10 1976-10-01       1129.4 1071.92
## 11 1976-11-01       1084.4 1060.68
## 12 1976-12-01       1061.8 1067.52
## 13 1977-01-01        969.9 1131.86
## 14 1977-02-01       1092.1 1185.86
## 15 1977-03-01       1451.1 1248.92
tail(tvs_data)
##           Date VehicleSales      MA5
## 523 2019-07-01     1443.947 1525.549
## 524 2019-08-01     1685.342 1475.970
## 525 2019-09-01     1315.632 1454.317
## 526 2019-10-01     1380.180 1478.532
## 527 2019-11-01     1446.483       NA
## 528 2019-12-01     1565.023       NA
tvs_MA_ts <- ts(data = tvs_data[,-1],
              start = c(year(min(tvs$Date)),
                        month(min(tvs$Date))),
              frequency = 12)
  
autoplot(tvs_MA_ts)

tvs_data$MA12 <- slider::slide_dbl(tvs$VehicleSales, mean,
                               .before = 5, .after = 6, .complete = TRUE)

tvs_data$MA2x12 <- slider::slide_dbl(tvs_data$MA12, mean,
                                    .before = 1, .after = 0, .complete = TRUE)


head(tvs_data)
##         Date VehicleSales     MA5     MA12 MA2x12
## 1 1976-01-01        885.2      NA       NA     NA
## 2 1976-02-01        994.7      NA       NA     NA
## 3 1976-03-01       1243.6 1103.58       NA     NA
## 4 1976-04-01       1191.2 1177.48       NA     NA
## 5 1976-05-01       1203.2 1211.00       NA     NA
## 6 1976-06-01       1254.7 1167.50 1107.875     NA
tvs_MA_ts <- ts(data = tvs_data[,-1],
                start = c(year(min(tvs$Date)),
                          month(min(tvs$Date))),
                frequency = 12)

head(tvs_MA_ts)
##          VehicleSales     MA5     MA12 MA2x12
## Jan 1976        885.2      NA       NA     NA
## Feb 1976        994.7      NA       NA     NA
## Mar 1976       1243.6 1103.58       NA     NA
## Apr 1976       1191.2 1177.48       NA     NA
## May 1976       1203.2 1211.00       NA     NA
## Jun 1976       1254.7 1167.50 1107.875     NA
autoplot(tvs_MA_ts[ , 4], color = "yellow", size = 3, alpha = 0.5) + 
  autolayer(decompose(tvs_ts, type="additive")$trend)
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).